1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

1.2 The Data

DARWIN <- read.csv("~/GitHub/FCA/Data/DARWIN/DARWIN.csv")
rownames(DARWIN) <- DARWIN$ID
DARWIN$ID <- NULL
DARWIN$class <- 1*(DARWIN$class=="P")
print(table(DARWIN$class))
#> 
#>  0  1 
#> 85 89

DARWIN[,1:ncol(DARWIN)] <- sapply(DARWIN,as.numeric)

signedlog <- function(x) { return (sign(x)*log(abs(1.0e12*x)+1.0))}
whof <- !(colnames(DARWIN) %in% c("class"));
DARWIN[,whof] <- signedlog(DARWIN[,whof])

1.2.0.1 Standarize the names for the reporting

studyName <- "DARWIN"
dataframe <- DARWIN
outcome <- "class"

TopVariables <- 10

thro <- 0.80
cexheat = 0.15

1.3 Generaring the report

1.3.1 Libraries

Some libraries

library(psych)
library(whitening)
library("vioplot")

1.3.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
174 450
pander::pander(table(dataframe[,outcome]))
0 1
85 89

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

largeSet <- length(varlist) > 1500

1.3.3 Scaling the data

Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns


  ### Some global cleaning
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]


dataframe <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.4 The heatmap of the data


if (!largeSet)
{
  
  hm <- heatMaps(data=dataframe,
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat
                 )
  par(op)
}

1.4.0.1 Correlation Matrix of the Data

The heat map of the data


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.9992136

1.5 The decorrelation


DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  Included: 450 , Uni p: 0.006350853 , Uncorrelated Base: 153 , Outcome-Driven Size: 0 , Base Size: 153 
#> 
#> 
 1 <R=0.999,r=0.975,N=   82>, Top: 41( 1 )[ 1 : 41 Fa= 41 : 0.975 ]( 41 , 41 , 0 ),<|>Tot Used: 82 , Added: 41 , Zero Std: 0 , Max Cor: 0.974
#> 
 2 <R=0.974,r=0.962,N=   82>, Top: 18( 1 )[ 1 : 18 Fa= 58 : 0.962 ]( 18 , 19 , 41 ),<|>Tot Used: 118 , Added: 19 , Zero Std: 0 , Max Cor: 0.960
#> 
 3 <R=0.960,r=0.955,N=   82>, Top: 8( 1 )[ 1 : 8 Fa= 66 : 0.955 ]( 8 , 8 , 58 ),<|>Tot Used: 134 , Added: 8 , Zero Std: 0 , Max Cor: 0.955
#> 
 4 <R=0.955,r=0.927,N=   43>, Top: 21( 1 )[ 1 : 21 Fa= 84 : 0.927 ]( 21 , 21 , 66 ),<|>Tot Used: 173 , Added: 21 , Zero Std: 0 , Max Cor: 0.927
#> 
 5 <R=0.927,r=0.914,N=   43>, Top: 10( 1 )[ 1 : 10 Fa= 89 : 0.914 ]( 9 , 10 , 84 ),<|>Tot Used: 187 , Added: 10 , Zero Std: 0 , Max Cor: 0.912
#> 
 6 <R=0.912,r=0.906,N=   43>, Top: 3( 1 )[ 1 : 3 Fa= 90 : 0.906 ]( 3 , 3 , 89 ),<|>Tot Used: 190 , Added: 3 , Zero Std: 0 , Max Cor: 0.906
#> 
 7 <R=0.906,r=0.853,N=  107>, Top: 50( 2 )[ 1 : 50 Fa= 131 : 0.853 ]( 49 , 51 , 90 ),<|>Tot Used: 278 , Added: 51 , Zero Std: 0 , Max Cor: 0.913
#> 
 8 <R=0.913,r=0.856,N=  107>, Top: 5( 1 )[ 1 : 5 Fa= 132 : 0.856 ]( 5 , 5 , 131 ),<|>Tot Used: 280 , Added: 5 , Zero Std: 0 , Max Cor: 0.853
#> 
 9 <R=0.853,r=0.800,N=   59>, Top: 30( 1 )[ 1 : 30 Fa= 145 : 0.800 ]( 29 , 29 , 132 ),<|>Tot Used: 311 , Added: 29 , Zero Std: 0 , Max Cor: 0.929
#> 
 10 <R=0.929,r=0.814,N=   59>, Top: 7( 1 )[ 1 : 7 Fa= 146 : 0.814 ]( 7 , 7 , 145 ),<|>Tot Used: 311 , Added: 7 , Zero Std: 0 , Max Cor: 0.821
#> 
 11 <R=0.821,r=0.800,N=   59>, Top: 3( 1 )[ 1 : 3 Fa= 147 : 0.800 ]( 3 , 3 , 146 ),<|>Tot Used: 311 , Added: 3 , Zero Std: 0 , Max Cor: 0.797
#> 
 12 <R=0.797,r=0.800,N=    0>
#> 
 [ 12 ], 0.7965355 Decor Dimension: 311 Nused: 311 . Cor to Base: 166 , ABase: 96 , Outcome Base: 0 
#> 
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

489

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

332

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

4.9

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

4.58

1.5.1 The decorrelation matrix


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  UPSTM <- attr(DEdataframe,"UPSTM")
  
  gplots::heatmap.2(1.0*(abs(UPSTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
}

1.6 The heatmap of the decorrelated data

if (!largeSet)
{

  hm <- heatMaps(data=DEdataframe,
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

1.7 The correlation matrix after decorrelation

if (!largeSet)
{

  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after IDeA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.7965355

1.8 U-MAP Visualization of features

1.8.1 The UMAP based on LASSO on Raw Data

classes <- unique(dataframe[,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
datasetframe.umap = umap(scale(dataframe[,varlist]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=dataframe[,outcome],col=raincolors[dataframe[,outcome]+1])

1.8.2 The decorralted UMAP


datasetframe.umap = umap(scale(DEdataframe[,varlistc]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
text(datasetframe.umap$layout,labels=DEdataframe[,outcome],col=raincolors[DEdataframe[,outcome]+1])

1.9 Univariate Analysis

1.9.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")

100 : mean_jerk_in_air6 200 : disp_index12 300 : mean_speed_in_air17 400 : gmrt_on_paper23




univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

100 : La_mean_jerk_in_air6 200 : disp_index12 300 : La_mean_speed_in_air17 400 : gmrt_on_paper23

1.9.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
total_time23 0.767 0.909 -0.366 0.736 6.93e-05 0.863
total_time15 0.775 1.062 -0.442 0.572 4.78e-01 0.844
air_time23 0.599 0.766 -0.374 0.715 2.31e-02 0.844
air_time15 0.684 1.112 -0.506 0.669 7.09e-01 0.829
total_time17 0.806 1.082 -0.400 0.966 3.10e-02 0.824
paper_time23 0.690 1.106 -0.435 0.703 6.55e-01 0.814
air_time17 0.674 0.980 -0.378 0.863 8.86e-02 0.806
paper_time17 0.664 1.045 -0.413 0.929 1.79e-01 0.796
total_time6 0.680 1.069 -0.364 0.665 7.13e-01 0.790
air_time16 0.426 0.841 -0.414 0.650 8.51e-01 0.787


topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]

theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]

pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
air_time23 0.5993 0.766 -0.37359 0.715 2.31e-02 0.844
air_time15 0.6835 1.112 -0.50588 0.669 7.09e-01 0.829
air_time17 0.6742 0.980 -0.37796 0.863 8.86e-02 0.806
air_time16 0.4258 0.841 -0.41386 0.650 8.51e-01 0.787
disp_index23 0.5808 0.924 -0.35306 0.816 3.70e-01 0.787
air_time6 0.5641 0.982 -0.41988 0.746 6.19e-01 0.784
air_time7 0.5315 0.829 -0.23828 0.882 7.92e-03 0.779
gmrt_in_air7 -0.4478 0.811 0.42274 0.794 9.97e-01 0.775
air_time2 0.3619 0.810 -0.44747 0.699 1.52e-01 0.773
mean_acc_in_air17 -0.5049 0.918 0.33611 0.852 3.37e-01 0.767
La_paper_time21 0.3857 0.568 -0.00319 0.377 1.32e-01 0.735
La_total_time5 0.2557 0.509 0.00354 0.164 1.13e-07 0.730
La_mean_speed_on_paper13 -0.0359 0.115 0.02486 0.195 1.90e-05 0.728
La_mean_speed_on_paper2 -0.1135 0.389 0.08430 0.290 1.43e-08 0.716
La_total_time9 0.2789 0.434 -0.05732 0.444 5.04e-01 0.712

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")

theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))


theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)

pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
2.13 174 0.387


allSigvars <- names(dc)



dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
  coef <- theFormulas[[dx]]
  cname <- names(theFormulas[[dx]])
  names(cname) <- cname
  for (cf in names(coef))
  {
    if (cf != dx)
    {
      if (coef[cf]>0)
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
      }
      else
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("%5.3f*%s",coef[cf],cname[cf]))
      }
    }
  }
}

finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])


orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")

finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
air_time23 0.5993 0.766 -0.37359 0.715 2.31e-02 0.844 0.844 1
air_time15 0.6835 1.112 -0.50588 0.669 7.09e-01 0.829 0.829 1
air_time17 0.6742 0.980 -0.37796 0.863 8.86e-02 0.806 0.806 1
air_time16 0.4258 0.841 -0.41386 0.650 8.51e-01 0.787 0.787 1
disp_index23 0.5808 0.924 -0.35306 0.816 3.70e-01 0.787 0.787 1
air_time6 0.5641 0.982 -0.41988 0.746 6.19e-01 0.784 0.784 1
air_time7 0.5315 0.829 -0.23828 0.882 7.92e-03 0.779 0.779 1
gmrt_in_air7 -0.4478 0.811 0.42274 0.794 9.97e-01 0.775 0.775 1
total_time9 NA 0.5997 0.996 -0.35254 0.638 7.86e-01 0.774 0.774 NA
air_time2 0.3619 0.810 -0.44747 0.699 1.52e-01 0.773 0.773 1
mean_acc_in_air17 -0.5049 0.918 0.33611 0.852 3.37e-01 0.767 0.767 2
La_paper_time21 -0.871disp_index21 + 1.000paper_time21 0.3857 0.568 -0.00319 0.377 1.32e-01 0.735 0.542 -1
La_total_time5 -0.813paper_time5 + 1.000total_time5 0.2557 0.509 0.00354 0.164 1.13e-07 0.730 0.674 -1
La_mean_speed_on_paper13 -0.971gmrt_on_paper13 + 1.000mean_speed_on_paper13 -0.0359 0.115 0.02486 0.195 1.90e-05 0.728 0.626 -1
mean_speed_on_paper2 NA -0.3422 0.901 0.35546 0.928 4.91e-01 0.720 0.720 NA
La_mean_speed_on_paper2 -0.878gmrt_on_paper2 + 1.000mean_speed_on_paper2 -0.1135 0.389 0.08430 0.290 1.43e-08 0.716 0.720 -1
La_total_time9 -0.891air_time9 + 1.000total_time9 0.2789 0.434 -0.05732 0.444 5.04e-01 0.712 0.774 -1
air_time9 NA 0.3602 1.007 -0.33148 0.731 6.92e-01 0.699 0.699 1
total_time5 NA 0.3576 1.238 -0.21141 0.766 6.87e-01 0.674 0.674 NA
gmrt_on_paper2 NA -0.2603 0.961 0.30870 1.034 9.26e-01 0.663 0.663 2
paper_time5 NA 0.1254 1.429 -0.26454 0.926 8.36e-01 0.629 0.629 1
mean_speed_on_paper13 NA -0.3097 0.976 0.13242 0.759 9.69e-01 0.626 0.626 NA
gmrt_on_paper13 NA -0.2820 1.004 0.11081 0.765 9.59e-01 0.606 0.606 2
paper_time21 NA 0.1398 1.174 -0.08999 1.084 8.16e-01 0.542 0.542 NA
disp_index21 NA -0.2822 1.300 -0.09963 0.981 7.01e-02 0.538 0.538 1